# importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# importing data
data = pd.read_csv('hotel_cancel.csv')
# data shape (rows, columns)
print("data rows & columns ->",data.shape)
# data's top 5 rows
data.head()
# data's summary (dtypes & non-null counts per column)
data.info()
# data overview & samples (especially on nan & nan percentage)
listItem = []
for col in data.columns:
    uniques = data[col].drop_duplicates()
    listItem.append([
        col,
        data[col].dtype,
        data[col].isna().sum(),
        round((data[col].isna().sum() / len(data[col])) * 100, 2),
        data[col].nunique(),
        # sample at most 2 distinct values; a bare .sample(2) raises
        # ValueError on a constant (single-value) column
        list(uniques.sample(min(2, len(uniques))).values),
    ])
dfDesc = pd.DataFrame(columns=['Data Features', 'Data Type', 'Null Count', 'Null %', 'N-Unique', 'Unique Sample'],
                      data=listItem)
dfDesc
# agent (null 14%) feature unique counts
data["agent"].value_counts()
# company (null 94%) feature unique counts
data["company"].value_counts()
# country (null 0.4%) feature unique counts
data["country"].value_counts()
# number of rows containing at least one null value
len(data[data.isnull().any(axis=1)])
# number of fully populated rows (no nulls at all)
len(data[~data.isnull().any(axis=1)])
# trying to find the null combinations
pd.options.display.max_rows = None
# Vectorized: for each row, join the names of the columns that are null.
# The original cell-by-cell `.iloc` scan with a string comparison against
# 'nan' was O(rows x cols) in pure Python and would also misclassify
# literal 'nan'/'NaN' string values as missing data.
na_mask = data.isna().to_numpy()
null = ["".join(col + " " for col in data.columns[row]) for row in na_mask]
(pd.Series(null)).value_counts()
# From the observations:
# creating copied data for contingency
mydata = data.copy()
# dropping the mostly-null columns & then the remaining null rows
mydata = mydata.drop(['agent','company'], axis=1)
mydata = mydata.dropna()
# resetting index — assign the result back: reset_index is NOT in-place,
# the original discarded its return value and kept the old index
mydata = mydata.reset_index(drop=True)
# data's new shape
print("before ->",data.shape)
print("after ->",mydata.shape)
# final null values checking
mydata.isnull().sum()
# basic description for numerical features
# (max_rows is re-set before each display so the full table renders)
pd.options.display.max_rows = None
mydata.describe().T
# basic description for numerical features based on canceled or not
pd.options.display.max_rows = None
mydata.groupby('is_canceled').describe().T
# basic description for categorical features
pd.options.display.max_rows = None
mydata.describe(include="object").T
# basic description for categorical features based on canceled or not
pd.options.display.max_rows = None
mydata.groupby('is_canceled').describe(include="object").T
# basic description for target feature ("is_canceled")
plt.title("Order Cancellation Count", fontdict={'fontsize': 15})
sns.countplot(data = mydata, y ='is_canceled')
# plotting pearson correlation heatmap
plt.figure(figsize=(20,20))
# numeric_only=True: mydata still holds object columns, and pandas >= 2.0
# raises on DataFrame.corr() over non-numeric data (older pandas silently
# dropped them — this keeps the original behavior everywhere)
sns.heatmap(mydata.corr(numeric_only=True), annot=True)
# In terms of Pearson Correlation:
# plotting spearman correlation heatmap
plt.figure(figsize=(20,20))
# numeric_only=True keeps pandas >= 2.0 from raising on the object columns
sns.heatmap(mydata.corr(method="spearman", numeric_only=True), annot=True)
# In terms of Spearman Correlation:
# defining cramers v function to see the association between two categorical features
def cramers_v(x, y):
    """Bias-corrected Cramer's V association between two categorical series.

    Uses the Bergsma-Wicher correction on top of the chi-squared statistic.
    Returns a value in [0, 1]; np.nan for a degenerate table (a variable
    with a single category), where the measure is undefined and the
    original code divided by zero.
    """
    import scipy.stats as ss
    confusion_matrix = pd.crosstab(x, y)
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    # bias correction of phi^2 and of the table dimensions
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denom = min((kcorr - 1), (rcorr - 1))
    if denom <= 0:
        # single-row or single-column table: no measurable association
        return np.nan
    return np.sqrt(phi2corr / denom)
# cramer's v value of every categorical column against the target variable
categorical_columns = mydata.select_dtypes('object').columns
print ("Cramer's V")
for col in categorical_columns:
    association = cramers_v(mydata[col], mydata['is_canceled'])
    print(f"-{col}:\n {association}")
# In terms of Cramer's V:
# total bookings per market segment, split by cancellation outcome
mysegments1 = mydata[mydata['is_canceled']==1]['market_segment'].value_counts()
mysegments2 = mydata[mydata['is_canceled']==0]['market_segment'].value_counts()
# one pie chart per outcome group (canceled first, stayed second)
for counts, heading in (
        (mysegments1, "Booking per market segment for canceled orders"),
        (mysegments2, "Booking per market segment for stayed orders")):
    fig = px.pie(counts, values=counts.values, names=counts.index, title=heading)
    fig.update_traces(rotation=-90, textinfo="percent+label")
    fig.show()
# From the observations:
sns.set(style = "whitegrid")
plt.figure(figsize= (18,5))
# countplot for repeated guest count
plt.subplot(1,3,1)
plt.title("Repeater Guests Count", fontdict={'fontsize': 15})
sns.countplot(data=mydata, x="is_repeated_guest")
# countplot for cancellation count with repeated guest hue
plt.subplot(1,3,2)
plt.title("Cancellation Count by Repeater Guests", fontdict={'fontsize': 15})
sns.countplot(data=mydata, x="is_repeated_guest", hue='is_canceled')
# barplot for repeated guest & cancellation
# (bar height = mean of the 0/1 target, i.e. the cancellation rate)
plt.subplot(1,3,3)
plt.title("Cancellation Mean by Repeater Guests", fontdict={'fontsize': 15})
sns.barplot(data=mydata, y="is_canceled", x='is_repeated_guest')
# From the observations:
# plot for highest occuring lead time
mydata['lead_time'].value_counts().head(10).plot(kind="bar")
# share of bookings per lead_time value — computed once and reused below
# (the original recomputed value_counts(normalize=True) three times)
lead_share = mydata['lead_time'].value_counts(normalize=True)
# percentage of top 2 most frequent values
print("Percentage of last-minute bookers:", lead_share.head(2).sum())
# percentage of the rest of values
print("Percentage of early-bookers:", lead_share.sum() - lead_share.head(2).sum())
# plotting kdeplot with cancellation as hue
# NOTE(review): the original plotted the raw `data` frame here while every
# other figure uses the cleaned `mydata`; switched for consistency.
sns.FacetGrid(mydata, hue ='is_canceled', height=5, xlim=(0,700)).map(sns.kdeplot,'lead_time', shade = True).add_legend()
# scatter plot of lead time, previous cancellations & order cancellation
fig = px.scatter(mydata, x='lead_time', y='previous_cancellations', color='is_canceled',
                 title="Lead-time, Previous Cancellations & Order Cancellation")
fig.update_layout(autosize=False,width=700,height=500,margin=dict(l=50,r=50,b=100,t=100,pad=4))
fig.show()
# From the observations:
sns.set(style = "whitegrid")
plt.figure(figsize= (18,5))
# countplot for deposit type count
plt.subplot(1,3,1)
plt.title("Deposit Type Count", fontdict={'fontsize': 15})
sns.countplot(data=mydata, x="deposit_type")
# countplot for cancellation count with cancellation hue
plt.subplot(1,3,2)
plt.title("Cancellation Count by Deposit Type", fontdict={'fontsize': 15})
sns.countplot(data=mydata, x="deposit_type", hue='is_canceled')
# barplot for deposit type & cancellation (bar height = cancellation rate per type)
plt.subplot(1,3,3)
plt.title("Cancellation Mean by Deposit Type", fontdict={'fontsize': 15})
sns.barplot(data=mydata, y="is_canceled", x='deposit_type')
# crosstab for deposit type & order cancellation
pd.crosstab(mydata['deposit_type'],mydata['is_canceled'])
# lead_time distribution by deposit_type
sns.set_style("whitegrid")
# NOTE: sns.distplot is deprecated in seaborn >= 0.11 (histplot/displot there)
sns.FacetGrid(data=mydata, col='deposit_type').map(sns.distplot,'lead_time').add_legend()
# looking at non refund deposits yet cancelling orders
NonRefund_1 = mydata[(mydata['deposit_type']=='Non Refund') & (mydata['is_canceled']==1)]
NonRefund_1.describe().T
# From the observations:
# countplot by week
sns.set(style = "whitegrid")
plt.figure(figsize=(20,6))
sns.countplot(mydata['arrival_date_week_number'])
plt.title("Room orders over the year (in weeks)", fontsize=16)
plt.xlabel("Week", fontsize=16)
plt.xticks(rotation=45)
plt.ylabel("Orders", fontsize=16)
plt.show()
# ordering data by month:
monthly_adr = mydata[["hotel", "arrival_date_month", "adr","is_canceled"]].sort_values("arrival_date_month")
# make the month column an ordered categorical so plots follow calendar order
# instead of the alphabetical order the sort above produced
ordered_months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
monthly_adr["arrival_date_month"] = pd.Categorical(monthly_adr["arrival_date_month"], categories=ordered_months, ordered=True)
# barplot with standard deviation:
plt.figure(figsize=(20, 6))
# ci="sd" shades one standard deviation around the monthly mean ADR
sns.lineplot(x = "arrival_date_month", y="adr", hue="hotel", data=monthly_adr,
             hue_order = ["City Hotel", "Resort Hotel"], ci="sd", size="hotel", sizes=(2.5, 2.5))
plt.title("Room prices over the year (in months)", fontsize=16)
plt.xlabel("Month", fontsize=16)
plt.xticks(rotation=45)
plt.ylabel("Average Daily Rate [EUR]", fontsize=16)
plt.show()
# countplot by month
plt.figure(figsize=(20,6))
sns.countplot(data=monthly_adr, x="arrival_date_month" , hue="is_canceled")
plt.title("Order Cancellations over the year (in months)", fontsize=16)
plt.xlabel("Month", fontsize=16)
plt.xticks(rotation=45)
plt.ylabel("Orders", fontsize=16)
plt.show()
# From the observations:
sns.set(style = "whitegrid")
plt.figure(figsize= (18,5))
# countplot for customer type count
plt.subplot(1,3,1)
plt.title("Customer Type Count", fontdict={'fontsize': 15})
sns.countplot(data=mydata, x="customer_type")
# countplot for cancellation count with cancellation hue
plt.subplot(1,3,2)
plt.title("Cancellation Count by Customer Type", fontdict={'fontsize': 15})
sns.countplot(data=mydata, x="customer_type", hue='is_canceled')
# barplot for customer type & cancellation (bar height = cancellation rate per type)
plt.subplot(1,3,3)
plt.title("Cancellation Mean by Customer Type", fontdict={'fontsize': 15})
sns.barplot(data=mydata, y="is_canceled", x='customer_type')
# crosstab for customer type & order cancellation
pd.crosstab(mydata['customer_type'],mydata['is_canceled'])
# lead_time distribution by customer_type
sns.set_style("whitegrid")
sns.FacetGrid(data=mydata, col='customer_type').map(sns.distplot,'lead_time').add_legend()
# looking at contract customers yet cancelling orders
Contract_1 = mydata[(mydata['customer_type']=='Contract') & (mydata['is_canceled']==1)]
Contract_1.describe().T
# From the observations:
# normality (Shapiro-Wilk), skewness & kurtosis for every numeric feature
from scipy.stats import shapiro, anderson, skew, kurtosis
for feature in mydata.select_dtypes(exclude = 'object').columns:
    values = mydata[feature]
    print("[{}]".format(feature))
    # reject normality at the conventional 5% significance level
    _, p_value = shapiro(values)
    verdict = 'Not-Normal Distribution' if p_value < 0.05 else 'Normal Distribution'
    print(verdict)
    print('Skewness: {}, Kurtosis {}'.format(skew(values), kurtosis(values)))
    sns.distplot(values)
    plt.title(feature)
    plt.show()
    print('\n')
# From the observations:
# making country names into domestic & international
def tourist(cols):
    """Map a country code to 'Domestic' (exactly 'PRT') or 'International'."""
    return 'Domestic' if cols == 'PRT' else 'International'
# recode country in place and show the resulting two-level distribution
mydata['country'] = mydata['country'].apply(tourist)
mydata['country'].value_counts().head(10)
# streamlining the deposit type into two categories only (0=Flexible; 1=Inflexible)
def yes(cols):
    """Encode deposit_type: 0 for flexible labels, 1 for anything else."""
    flexible = ("No Deposit", "Refundable")
    return 0 if cols in flexible else 1
# apply the binary encoding in place and inspect the class balance
mydata['deposit_type'] = mydata['deposit_type'].apply(yes)
mydata['deposit_type'].value_counts()
# taking relevant features into model
mycoba = mydata[['hotel','adr','country','lead_time','previous_cancellations','previous_bookings_not_canceled','required_car_parking_spaces','total_of_special_requests',
                 'children','adults','customer_type','deposit_type','market_segment','is_repeated_guest','is_canceled']]
# model features
mycoba.info()
# making dummies for selected categorical columns
# (drop_first=True avoids the dummy-variable trap / perfect collinearity)
mydummy = pd.get_dummies(data= mycoba, drop_first= True, columns = ['hotel','country','customer_type','deposit_type','market_segment'] )
mydummy.columns
# assigning features (dummy-encoded column names produced above)
IV = ['adr', 'lead_time', 'previous_cancellations',
      'previous_bookings_not_canceled', 'required_car_parking_spaces',
      'total_of_special_requests', 'children', 'adults', 'is_repeated_guest',
      'hotel_Resort Hotel', 'country_International',
      'customer_type_Group', 'customer_type_Transient',
      'customer_type_Transient-Party', 'deposit_type_1',
      'market_segment_Complementary', 'market_segment_Corporate',
      'market_segment_Direct', 'market_segment_Groups',
      'market_segment_Offline TA/TO', 'market_segment_Online TA']
# independent variables
x = mydummy[IV]
# dependent/target variable
y = mydummy['is_canceled']
# splitting the training data - test data = 80% : 20%
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state= 101)
# create a new x_train & y_train variable (x_trainres & y_trainres) that is resampled using SMOTE method
# (resampling is applied to the training split only, so the test set stays untouched)
from imblearn.over_sampling import SMOTE
from collections import Counter
# SMOTE needs an integer-typed target
y_train = y_train.astype('int')
smo = SMOTE(random_state=0, sampling_strategy='minority')
x_trainres, y_trainres = smo.fit_resample(x_train, y_train)
print("normal data (target) :",sorted(Counter(y_train).items()))
print("oversampled data (target) :",sorted(Counter(y_trainres).items()))
# importing libraries & algorithms to be compared
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, fbeta_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
# to feed the random state
seed = 7
# prepare models: plain names train on the normal split,
# "2"-suffixed copies train on the SMOTE-resampled split
models = []
models.append(('LR', LogisticRegression()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))
models.append(('NB', BernoulliNB()))
models.append(('XGB', XGBClassifier()))
models.append(('LR2', LogisticRegression()))
models.append(('DT2', DecisionTreeClassifier()))
models.append(('RF2', RandomForestClassifier()))
models.append(('NB2', BernoulliNB()))
models.append(('XGB2', XGBClassifier()))
# scoring
results = []
names = []
scoring = 'accuracy'
# evaluate each model in turn for normal & oversampled data
for name, model in models:
    # shuffle=True is required for random_state to take effect;
    # sklearn >= 0.24 raises ValueError for KFold(random_state=...) without it
    kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
    if "2" in name:
        cv_results = cross_val_score(model, x_trainres, y_trainres, cv=kfold, scoring=scoring)
    else:
        cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# boxplot algorithm comparison for normal & oversampled data
fig = plt.figure(figsize=(20,6))
fig.suptitle('Algorithm Comparison - On Accuracy')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# prepare models (fresh, unfitted copies for the recall comparison)
models = []
models.append(('LR', LogisticRegression()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))
models.append(('NB', BernoulliNB()))
models.append(('XGB', XGBClassifier()))
models.append(('LR2', LogisticRegression()))
models.append(('DT2', DecisionTreeClassifier()))
models.append(('RF2', RandomForestClassifier()))
models.append(('NB2', BernoulliNB()))
models.append(('XGB2', XGBClassifier()))
# scoring
results = []
names = []
scoring = 'recall'
# evaluate each model in turn for normal & oversampled data
for name, model in models:
    # shuffle=True is required for random_state to take effect;
    # sklearn >= 0.24 raises ValueError for KFold(random_state=...) without it
    kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
    if "2" in name:
        cv_results = cross_val_score(model, x_trainres, y_trainres, cv=kfold, scoring=scoring)
    else:
        cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# boxplot algorithm comparison for normal & oversampled data
fig = plt.figure(figsize=(20,6))
fig.suptitle('Algorithm Comparison - On Recall')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# prepare models (fresh, unfitted copies for the F1 comparison)
models = []
models.append(('LR', LogisticRegression()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))
models.append(('NB', BernoulliNB()))
models.append(('XGB', XGBClassifier()))
models.append(('LR2', LogisticRegression()))
models.append(('DT2', DecisionTreeClassifier()))
models.append(('RF2', RandomForestClassifier()))
models.append(('NB2', BernoulliNB()))
models.append(('XGB2', XGBClassifier()))
# scoring
results = []
names = []
scoring = 'f1'
# evaluate each model in turn for normal & oversampled data
for name, model in models:
    # shuffle=True is required for random_state to take effect;
    # sklearn >= 0.24 raises ValueError for KFold(random_state=...) without it
    kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
    if "2" in name:
        cv_results = cross_val_score(model, x_trainres, y_trainres, cv=kfold, scoring=scoring)
    else:
        cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# boxplot algorithm comparison for normal & oversampled data
fig = plt.figure(figsize=(20,6))
fig.suptitle('Algorithm Comparison - On F1 Score')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# prepare models (fresh, unfitted copies for the ROC-AUC comparison)
models = []
models.append(('LR', LogisticRegression()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))
models.append(('NB', BernoulliNB()))
models.append(('XGB', XGBClassifier()))
models.append(('LR2', LogisticRegression()))
models.append(('DT2', DecisionTreeClassifier()))
models.append(('RF2', RandomForestClassifier()))
models.append(('NB2', BernoulliNB()))
models.append(('XGB2', XGBClassifier()))
# scoring containers
results = []
names = []
# evaluate each model in turn for normal & oversampled data
for name, model in models:
    # "2"-suffixed models run on the SMOTE-resampled split
    train_x, train_y = (x_trainres, y_trainres) if "2" in name else (x_train, y_train)
    cv_results = cross_val_score(model, train_x, train_y, cv=10, n_jobs=-3, scoring='roc_auc')
    results.append(cv_results)
    names.append(name)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))
# boxplot algorithm comparison for normal & oversampled data
fig = plt.figure(figsize=(20,6))
fig.suptitle('Algorithm Comparison - On ROC-AUC')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# model fitting for normal data
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=7)
rfc.fit(x_train, y_train)
# model fitting for oversampled (SMOTE) data, same seed for comparability
from sklearn.ensemble import RandomForestClassifier
rfc2 = RandomForestClassifier(random_state=7)
rfc2.fit(x_trainres, y_trainres)
# importing relevant libraries
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
# normal - classification report for train & test data
print("### TRAIN DATA (NORMAL)")
print(classification_report(y_train, rfc.predict(x_train)))
rfc_pred = rfc.predict(x_test)
print("### TEST DATA (NORMAL)")
print(classification_report(y_test, rfc_pred))
# oversampled - classification report for train & test data
# (both models are evaluated on the same, un-resampled test split)
print("### TRAIN DATA (OVERSAMPLED)")
print(classification_report(y_trainres, rfc2.predict(x_trainres)))
rfc_pred2 = rfc2.predict(x_test)
print("### TEST DATA (OVERSAMPLED)")
print(classification_report(y_test, rfc_pred2))
# confusion matrix for normal data (rows = actual, columns = predicted)
cnf_matrix = confusion_matrix(y_test, rfc_pred)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, fmt='g')
plt.tight_layout()
plt.title('Confusion Matrix - Normal')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()
# confusion matrix for oversampled data
cnf_matrix = confusion_matrix(y_test, rfc_pred2)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, fmt='g')
plt.tight_layout()
plt.title('Confusion Matrix - Oversampled')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()
# ROC curve & AUC for the normal (rfc) and oversampled (rfc2) random forests
from sklearn import metrics
for model, title in ((rfc, 'Receiver Operator Characteristic - Normal'),
                     (rfc2, 'Receiver Operator Characteristic - Oversampled')):
    # probability of the positive class (is_canceled == 1)
    pred_proba = model.predict_proba(x_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, pred_proba)
    auc = metrics.roc_auc_score(y_test, pred_proba)
    plt.figure(figsize= (10,5))
    plt.title(title)
    # single curve with a single label; the original drew the same curve
    # twice (with a misleading "DT" label) and called plt.legend() twice
    plt.plot(fpr, tpr, 'b', label='RF, AUC = {}'.format(round(auc, 2)))
    # chance-level diagonal for reference
    plt.plot([0,1], [0,1], 'r--')
    plt.xlim([0,1])
    plt.ylim([0,1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend(loc='lower right')
    plt.show()
# From the observations:
# feature importance for normal data (impurity-based importances, sum to 1)
rfc_coef1 = pd.Series(rfc.feature_importances_, x_train.columns).sort_values(ascending= False)
rfc_coef1.plot(kind = 'bar', title='Feature Importances - Normal Data')
plt.show()
# feature importance for oversampled data
rfc_coef2 = pd.Series(rfc2.feature_importances_, x_trainres.columns).sort_values(ascending= False)
rfc_coef2.plot(kind = 'bar', title='Feature Importances - Oversampled Data')
plt.show()
# From the observations:
# permutation importance for normal data (scored on the held-out test split)
from eli5 import show_weights
from eli5.sklearn import PermutationImportance
rfcperm = PermutationImportance(rfc, scoring = 'roc_auc', random_state= 7).fit(x_test, y_test)
show_weights(rfcperm, feature_names = list(x_test.columns))
# permutation importance for oversampled data
rfcperm2 = PermutationImportance(rfc2, scoring = 'roc_auc', random_state= 7).fit(x_test, y_test)
# NOTE: show_weights returns an HTML object, so it only renders in a notebook cell
show_weights(rfcperm2, feature_names = list(x_test.columns))
# From the observations:
## GRIDSEARCHCV
from sklearn.model_selection import GridSearchCV
# NOTE(review): grid-searching over random_state tunes noise rather than a
# real hyperparameter — the "best" seed is unlikely to generalize; consider
# removing this step and tuning structural parameters only (as grid2 does).
grid1 = GridSearchCV(estimator = rfc2,
                     refit = 'recall',
                     param_grid = {'random_state': np.arange(1,101)},
                     scoring = 'recall',
                     cv = 5, n_jobs = -1)
grid1.fit(x_trainres, y_trainres)
# best parameters
grid1.best_params_
# model fitting for oversampled data
# (random_state=16 presumably taken from grid1.best_params_ — confirm)
from sklearn.ensemble import RandomForestClassifier
rfc3 = RandomForestClassifier(random_state=16)
rfc3.fit(x_trainres, y_trainres)
# oversampled - classification report for train & test data
print("### TRAIN DATA (OVERSAMPLED)")
print(classification_report(y_trainres, rfc3.predict(x_trainres)))
rfc_pred3 = rfc3.predict(x_test)
print("### TEST DATA (OVERSAMPLED)")
print(classification_report(y_test, rfc_pred3))
# confusion matrix (rows = actual, columns = predicted)
cnf_matrix = confusion_matrix(y_test, rfc_pred3)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, fmt='g')
plt.tight_layout()
plt.title('Confusion Matrix - RF3')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()
# ROC curve & AUC for the seed-tuned model (rfc3)
from sklearn import metrics
# probability of the positive class (is_canceled == 1)
pred_proba = rfc3.predict_proba(x_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, pred_proba)
auc = metrics.roc_auc_score(y_test, pred_proba)
plt.figure(figsize= (10,5))
plt.title('Receiver Operator Characteristic - RF3')
# single curve + single legend; the original plotted the same curve twice
# (with a stale "DT" label) and called plt.legend() twice
plt.plot(fpr, tpr, 'b', label='RF3, AUC = {}'.format(round(auc, 2)))
# chance-level diagonal for reference
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc='lower right')
plt.show()
# wider hyperparameter search on the oversampled model
grid2 = GridSearchCV(estimator = rfc2,
                     refit = 'recall',
                     param_grid = {
                         'n_estimators':[1000,1800],
                         'bootstrap': [True,False],
                         # 'auto' (removed in sklearn >= 1.3) was an alias of
                         # 'sqrt' for classifiers, so the original searched the
                         # same value twice; None (= all features) gives a
                         # genuinely different candidate
                         'max_features': ['sqrt', None],
                         'max_depth': [80,None],
                     },
                     scoring = 'recall',
                     cv = 5, n_jobs = -1)
grid2.fit(x_trainres, y_trainres)
# best parameters
grid2.best_params_
# model fitting for oversampled data with the parameters found by grid2
from sklearn.ensemble import RandomForestClassifier
# random_state added for reproducibility, consistent with rfc/rfc2/rfc3
rfc4 = RandomForestClassifier(bootstrap=True, max_depth=80, max_features= 'sqrt', n_estimators= 1000,
                              random_state=7)
rfc4.fit(x_trainres, y_trainres)
# oversampled - classification report for train & test data
print("### TRAIN DATA (OVERSAMPLED)")
print(classification_report(y_trainres, rfc4.predict(x_trainres)))
rfc_pred4 = rfc4.predict(x_test)
print("### TEST DATA (OVERSAMPLED)")
print(classification_report(y_test, rfc_pred4))
# confusion matrix (rows = actual, columns = predicted)
cnf_matrix = confusion_matrix(y_test, rfc_pred4)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, fmt='g')
plt.tight_layout()
plt.title('Confusion Matrix - RF4')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()
# ROC curve & AUC for the grid-tuned model (rfc4)
from sklearn import metrics
# probability of the positive class (is_canceled == 1)
pred_proba = rfc4.predict_proba(x_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, pred_proba)
auc = metrics.roc_auc_score(y_test, pred_proba)
plt.figure(figsize= (10,5))
plt.title('Receiver Operator Characteristic - RF4')
# single curve + single legend; the original plotted the same curve twice
# (with a stale "DT" label) and called plt.legend() twice
plt.plot(fpr, tpr, 'b', label='RF4, AUC = {}'.format(round(auc, 2)))
# chance-level diagonal for reference
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc='lower right')
plt.show()
# From the observations:
# let's benchmark our tuned model performance with AutoML
from tpot import TPOTClassifier
# capped at 60 minutes; stops early after 20 generations without improvement
tpot = TPOTClassifier(subsample = 0.8, verbosity = 2, warm_start=True, early_stop=20, max_time_mins= 60, n_jobs= -2)
# fitting TPOT to our data
tpot.fit(x_train, y_train)
# exporting our model results (best pipeline) as a standalone script
tpot.export('tpot_HOTEL.py')
# using our exported AutoML model ('tpot_HOTEL.py') to our data
import numpy as np
import pandas as pd
from sklearn.decomposition import FastICA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
# fitting the AutoML-exported pipeline (FastICA -> distance-weighted 57-NN) to our data
autoML = make_pipeline(
    FastICA(tol=0.0),
    KNeighborsClassifier(n_neighbors=57, p=1, weights="distance"))
autoML.fit(x_train, y_train)
# normal - classification report for train & test data
print("### TRAIN DATA")
print(classification_report(y_train, autoML.predict(x_train)))
autoML_pred = autoML.predict(x_test)
print("### TEST DATA")
print(classification_report(y_test, autoML_pred))
# confusion matrix for normal data (rows = actual, columns = predicted)
cnf_matrix = confusion_matrix(y_test, autoML_pred)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, fmt='g')
plt.tight_layout()
plt.title('Confusion Matrix - AutoML')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()
# ROC curve & AUC for the AutoML pipeline
from sklearn import metrics
# probability of the positive class (is_canceled == 1)
pred_proba = autoML.predict_proba(x_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, pred_proba)
auc = metrics.roc_auc_score(y_test, pred_proba)
plt.figure(figsize= (10,5))
plt.title('Receiver Operator Characteristic - AutoML')
# single curve + single legend; the original plotted the same curve twice
# (with a stale "DT" label) and called plt.legend() twice
plt.plot(fpr, tpr, 'b', label='AutoML, AUC = {}'.format(round(auc, 2)))
# chance-level diagonal for reference
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc='lower right')
plt.show()
# From the observations:
# learning curves for RF2 and RF4 (accuracy vs number of training samples);
# the original duplicated this ~35-line section verbatim for each model
from sklearn.model_selection import learning_curve

def _plot_learning_curve(estimator):
    """Compute and plot the learning curve of *estimator* over the full (x, y).

    Prints the mean train/test accuracy per training-set size and draws both
    curves with a +/- one-standard-deviation band (10-fold CV).
    """
    train_sizes, train_scores, test_scores = learning_curve(estimator=estimator,
                                                            X=x,
                                                            y=y,
                                                            train_sizes=np.linspace(0.5, 1.0, 5),
                                                            cv=10)
    # mean/std of accuracy across CV folds, one entry per training-set size
    train_mean = np.mean(train_scores, axis=1)
    print('train mean: ')
    print(train_mean)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    print('test mean: ')
    print(test_mean)
    # training accuracy with its variance band
    plt.plot(train_sizes, train_mean, color='red', marker='o', label='Training Accuracy')
    plt.fill_between(train_sizes,
                     train_mean + train_std,
                     train_mean - train_std,
                     alpha=0.15, color='red')
    # test (validation) accuracy with its variance band
    plt.plot(train_sizes, test_mean, color='blue', linestyle='--', marker='s',
             label='Test Accuracy')
    plt.fill_between(train_sizes,
                     test_mean + test_std,
                     test_mean - test_std,
                     alpha=0.15, color='blue')
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

# RF2
_plot_learning_curve(rfc2)
# RF4
_plot_learning_curve(rfc4)
# From the observations:
# saving the tuned model for further usage
import pickle
filename = 'hotel_cancellation_tuned.sav'
# context manager guarantees the file handle is closed
# (the original open(...) was never closed)
with open(filename, 'wb') as model_file:
    pickle.dump(rfc4, model_file)
# Through all the observations, there are many features that keep showing significance: lead_time, adr, market_segment, country, previous_cancellations and deposit_type, with lead_time perhaps as the most important feature in relation to cancelling orders ('is_canceled'). In other words, these features have better explanatory relations towards our target variable of cancelling orders and may prove to be important for further observations & practical applications.
# Overall, the best model for our prediction is the hyperparameter-tuned, oversampled RandomForestClassifier (RF4). It even gives better results than the autoML model, especially in its Recall score, since we're trying to minimize False Negatives in our predictions. This model may be useful for further descriptions & deployments.